Introduction

Question: how does differential localization matches with gene expression?

The ENCODE data sets only included H1, K562 and “foreskin fibroblast”. To be able to use data from other cell types, I decided to download the reads and process the files myself. (With help of Federico’s pipeline.) In this way, I was able to include other data sets as well.

Method

ENCODE RNA-seq / CAGE .tsv expression tables.

Set-up

Set the parameters and list the data.

# Load dependencies
suppressPackageStartupMessages(library(GenomicRanges))
suppressPackageStartupMessages(library(rtracklayer))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(ggbeeswarm))
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(RColorBrewer))



# Load data from the previous module
input_dir_scaling <- "ts180131_normalization"
input_dir_regions <- "ts180131_differential_localization"

samples.df <- readRDS(file.path(input_dir_scaling,
                                "DamID_samples.rds"))

# Only plot the combined data here for simplicity
damid.combined <- readRDS(file.path(input_dir_scaling,
                                  "DamID_GRanges_scaled_combined.rds"))
damid.hmm <- readRDS(file.path(input_dir_scaling,
                               "DamID_HMM.rds"))
regions <- readRDS(file.path(input_dir_regions,
                             "DamID_GRangesList_regionsCombined.rds"))

regions.LADs <- readRDS(file.path(input_dir_regions,
                             "DamID_GRangesList_regionsLADs.rds"))
regions.NADs <- readRDS(file.path(input_dir_regions,
                             "DamID_GRangesList_regionsNADs.rds"))

# Set bin size
bin_size <- "80kb"

# Prepare output 
output_dir <- "ts180405_GeneExpression_Remapped"
dir.create(output_dir, showWarnings = FALSE)

RNA-seq

First, I need to load the RNA-seq data. I will load in ENCODE FPKM values and the new gene counts from the pipeline.

Before working with the remapped data, I want to note that these are absolute gene counts instead of FPKM values. Furthermore, currently I’ve treated all samples as “unstranded” even though some clearly are stranded.

# Import the genes
genes <- import.gff("ts180315_GeneExpression/gencode.v24.primary_assembly.annotation.gtf.gz")
genes <- genes[mcols(genes)$type == "gene"]

mcols(genes) <- mcols(genes)[, c("gene_id", "gene_name", "gene_type")]

# Read in RNA-seq data sets, this is gene counts
rnaseq_star.se <- read.delim("~/mydata/proj/3D_nucleus/data/external/ts180329_RNAseq_4DNcells/results_se/Star/count_table.tsv",
                             header = T, stringsAsFactors = FALSE)

rnaseq_star.pe <- read.delim("~/mydata/proj/3D_nucleus/data/external/ts180329_RNAseq_4DNcells/results_pe/Star/count_table.tsv",
                             header = T, stringsAsFactors = FALSE)
rnaseq_star.pe <- rnaseq_star.pe[, grep("\\.1$", names(rnaseq_star.pe), invert = T)]

rnaseq_star <- cbind(rnaseq_star.se, rnaseq_star.pe[2:ncol(rnaseq_star.pe)])

row.names(rnaseq_star) <- rnaseq_star$sample.id
rnaseq_star <- rnaseq_star[, 2:ncol(rnaseq_star)]

# Filter the worst samples
to_filter <- c("ENCFF000DJL_H1.hESC_NoDepletion_NotStranded_PE_r1", 
               "GSM1697490_WT..untreated_Hap1_NotDepleted_Stranded_PE")
rnaseq_star <- rnaseq_star[, - which(names(rnaseq_star) %in% to_filter)]

# Prepare a sample data frame
cells <- c("H1", "K562", "GM12878", "Hap1", "KBM7", "HEPG2", "HCT116", "RPE", "IMR.90", "HFF")
samples_star.df <- data.frame(samples = names(rnaseq_star),
                              stringsAsFactors = FALSE)

samples_star.df$cell <- ""
for (cell in cells) {
  samples_star.df$cell[grep(paste0("_", toupper(cell)), 
                            toupper(samples_star.df$samples))] <- cell
}

samples_star.df$stranded <- ifelse(grepl("NotStranded", samples_star.df$samples),
                                   "NotStranded", "Stranded")
samples_star.df$rRNADepleted <- ifelse(grepl("NotDepleted", samples_star.df$samples),
                                       "NotDepleted", "rDNADepleted")
samples_star.df$encode <- ifelse(grepl("ENC", samples_star.df$samples),
                                 "ENCODE", "NotENCODE")

samples_star.df$cell <- factor(samples_star.df$cell, 
                               levels = cells)


# Filter for lincRNA and protein_coding genes only
keep <- mcols(genes)$gene_type %in% c("protein_coding", "lincRNA")
genes <- genes[keep]
rnaseq_star <- rnaseq_star[keep, ]

#genes <- genes[mcols(genes)$gene_type %in% c("protein_coding", "lincRNA")]
#genes <- genes[mcols(genes)$gene_type %in% c("protein_coding")]


# Load this into DESeq2
rnaseq_dds <- DESeqDataSetFromMatrix(countData = rnaseq_star,
                                     colData = samples_star.df,
                                     design= ~ cell)
rnaseq_dds <- DESeq(rnaseq_dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 34 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# Get the "transformed" values and create PCA plot
rnaseq_dds.vsd <- vst(rnaseq_dds, blind=FALSE)
rnaseq_dds.vsd <- normTransform(rnaseq_dds, pc = 0.001)
plotPCA(rnaseq_dds.vsd, intgroup=c("cell"))

Okay, now we have the RNAseq samples (gene counts) loaded the PCA plot confirms that the samples are not too bad. Note that you do have some strange outliers, but I don’t know why and how right now.

Differential localization versus gene expression

Time to start asking interesting questions:

  • In the different cell types, what is the expression range within and outside of associated domains (LADs / NADs)?
  • In the different cell types, how is the expression range affected by the differential localization calculated with the DamID data?

Again, I need to prepare this by first matching the gene_ids to genomic locations.

# First, I will create a per-cell mean score
rnaseq_dds.assay <- data.frame(assay(rnaseq_dds.vsd))

rnaseq_dds.assay.mean <- data.frame(ensembl = row.names(rnaseq_dds.assay))
rnaseq_dds.assay.mean <- cbind(rnaseq_dds.assay.mean,
                               do.call(cbind, tapply(samples_star.df$samples,
                                                     samples_star.df$cell,
                                                     function(x) {
                                                       if (length(x) > 1) {
                                                         rowMeans(rnaseq_dds.assay[, x]) 
                                                       } else {
                                                         rnaseq_dds.assay[, x]
                                                       }})))



# Add the rnaseq scores
mcols(genes) <- cbind(mcols(genes), rnaseq_dds.assay.mean)

So, we have:

  • GRanges of the regions (LADs / NADs) and their preferential localization.
  • GRanges of the genes and their expression in the various cell lines.

Next, I want to look at the expression ranges of the genes within the compartments, and how this changes with the differential localization.

# I want to link the RNAseq to the DamID scores
# For now and for simplicity, I will simply use the TSS location to determine
# the DamID bin to use.

# For each cell type, add region information
plotExpression <- function(rnaseq.mean, regions, cell, 
                           ylim = NULL, gene_filter = NULL) {
  
  # Start with a fresh rnaseq object
  rnaseq.cell <- rnaseq.mean
  
  # Filter for gene type if given
  if (! is.null(gene_filter)) {
    rnaseq.cell <- genes[mcols(genes)$gene_type %in% gene_filter]
  }
  
  # Find overlaps
  ovl <- findOverlaps(rnaseq.cell, regions[[cell]], type = "within")
  
  # Add region information
  mcols(rnaseq.cell)$within_region <- FALSE
  mcols(rnaseq.cell)$within_region[queryHits(ovl)] <- TRUE
  
  mcols(rnaseq.cell)$difference <- NA
  mcols(rnaseq.cell)$difference[queryHits(ovl)] <- mcols(regions[[cell]])$difference[subjectHits(ovl)]
  
  mcols(rnaseq.cell)$differential <- "iLAD"
  mcols(rnaseq.cell)$differential[queryHits(ovl)] <- mcols(regions[[cell]])$differential[subjectHits(ovl)]
  mcols(rnaseq.cell)$differential[is.na(mcols(rnaseq.cell)$differential)] <- "-"
  mcols(rnaseq.cell)$differential <- factor(mcols(rnaseq.cell)$differential, 
                                            levels = c("4xAP3", "-", "LMNB1", "iLAD"))
  
  # Extract the mcols and plot
  x <- as(mcols(rnaseq.cell), "data.frame")
  x$cell <- x[, cell]
  
  #plt <- ggplot(x, aes(x = differential, y = log2(cell + pseudo), col = differential)) + 
  plt <- ggplot(x, aes(x = differential, y = cell, col = differential)) + 
    geom_quasirandom(alpha = 1) +
    geom_boxplot(col = "black", outlier.shape = NA, fill = NA) +
    ggtitle(cell) + 
    xlab("") +
    ylab("rlog(expression)") +
    theme_bw()
  
  if (! is.null(ylim)) {
    plt <- plt + ylim(ylim[1], ylim[2])
  }
  
  plt
  
}
# For each cell type, add region information
plotRegionsWithExpression <- function(rnaseq.mean, regions, cell, 
                                      gene_filter = NULL, polygon = T) {
  
  # Start with a fresh rnaseq object
  rnaseq.cell <- rnaseq.mean
  
  # Filter for gene type if given
  if (! is.null(gene_filter)) {
    rnaseq.cell <- genes[mcols(genes)$gene_type %in% gene_filter]
  }
  
  # Find overlaps
  ovl <- findOverlaps(rnaseq.cell, regions[[cell]], type = "within")
  
  # Add region information
  mcols(rnaseq.cell)$within_region <- FALSE
  mcols(rnaseq.cell)$within_region[queryHits(ovl)] <- TRUE
  
  mcols(rnaseq.cell)$difference <- NA
  mcols(rnaseq.cell)$difference[queryHits(ovl)] <- mcols(regions[[cell]])$difference[subjectHits(ovl)]
  
  mcols(rnaseq.cell)$differential <- "iLAD"
  mcols(rnaseq.cell)$differential[queryHits(ovl)] <- mcols(regions[[cell]])$differential[subjectHits(ovl)]
  mcols(rnaseq.cell)$differential[is.na(mcols(rnaseq.cell)$differential)] <- "-"
  mcols(rnaseq.cell)$differential <- factor(mcols(rnaseq.cell)$differential, 
                                            levels = c("4xAP3", "-", "LMNB1", "iLAD"))
  
  mcols(rnaseq.cell)$cell_LMNB1 <- NA
  mcols(rnaseq.cell)$cell_LMNB1[queryHits(ovl)] <- mcols(regions[[cell]])[, paste0(cell, "_LMNB1")][subjectHits(ovl)]
  
  mcols(rnaseq.cell)$cell_4xAP3 <- NA
  mcols(rnaseq.cell)$cell_4xAP3[queryHits(ovl)] <- mcols(regions[[cell]])[, paste0(cell, "_4xAP3")][subjectHits(ovl)]
  
  # Extract the mcols and plot
  x <- as(mcols(rnaseq.cell), "data.frame")
  x$cell <- x[, cell]
  
  # Plot
  limits <- c(min(rnaseq.cell$cell_LMNB1, rnaseq.cell$cell_4xAP3, na.rm = T),
              max(rnaseq.cell$cell_LMNB1, rnaseq.cell$cell_4xAP3, na.rm = T))
  
  if (polygon) {
    min.score <- 0.5
    min.difference <- 0.25
    
    polygon.LAD <- data.frame(x = c(min.score, min.score, limits[2], limits[2]),
                              y = c(limits[1], 0 + min.score - min.difference, limits[2] - min.difference, limits[1]))
    polygon.NAD <- data.frame(x = c(limits[1], 0 + min.score - min.difference, limits[2] - min.difference, limits[1]),
                              y = c(min.score, min.score, limits[2], limits[2]))
  }
  
  plt <- ggplot(x, aes(x = cell_LMNB1, y = cell_4xAP3, col = cell)) + 
    geom_abline(lty = 2)
  
  if (polygon) {
    plt <- plt + 
      geom_polygon(data = polygon.LAD, aes(x = x, y = y), col = NA, fill = "blue", alpha = 0.1) +
      geom_polygon(data = polygon.NAD, aes(x = x, y = y), col = NA, fill = "green", alpha = 0.1)
  }
  
  plt <- plt +
    geom_jitter(alpha = 1, size = 0.8, width = 0.05, height = 0.05) +
    ggtitle(cell) + 
    xlab("LMNB1") +
    ylab("4xAP3") +
    xlim(limits) + ylim(limits) +
    # scale_color_gradient2(low = "blue", mid = "grey", high = "red", midpoint = 0) +
    scale_colour_gradientn(colors = colorRamps::matlab.like(9)[c(1:4, 6:9)]) +
    theme_bw() +
    theme(aspect.ratio = 1)
  
  plt
  
}
plotDamIDWithExpression <- function(rnaseq.mean, damid.combined, cell, 
                                    gene_filter = NULL, alpha = 1, size = 0.5,
                                    polygon = T) {
  
  # Start with a fresh rnaseq object
  rnaseq.cell <- rnaseq.mean
  
  # Filter for gene type if given
  if (! is.null(gene_filter)) {
    rnaseq.cell <- genes[mcols(genes)$gene_type %in% gene_filter]
  }
  
  # Find overlaps
  tss <- rnaseq.cell
  start(tss) <- end(tss) <- ifelse(strand(rnaseq.cell) == "+",
                                   start(tss), end(tss))
  
  # ovl <- findOverlaps(rnaseq.cell, damid.combined, type = "within")
  ovl <- findOverlaps(tss, damid.combined, type = "within")
  
  # Add region information
  mcols(rnaseq.cell)$cell_LMNB1 <- NA
  mcols(rnaseq.cell)$cell_LMNB1[queryHits(ovl)] <- mcols(damid.combined)[, paste0(cell, "_LMNB1")][subjectHits(ovl)]
  
  mcols(rnaseq.cell)$cell_4xAP3 <- NA
  mcols(rnaseq.cell)$cell_4xAP3[queryHits(ovl)] <- mcols(damid.combined)[, paste0(cell, "_4xAP3")][subjectHits(ovl)]
  
  # Extract the mcols and plot
  x <- as(mcols(rnaseq.cell), "data.frame")
  x$cell <- x[, cell]
  
  # Plot
  limits <- c(min(rnaseq.cell$cell_LMNB1, rnaseq.cell$cell_4xAP3, na.rm = T),
              max(rnaseq.cell$cell_LMNB1, rnaseq.cell$cell_4xAP3, na.rm = T))
  
  if (polygon) {
    min.score <- 0.5
    min.difference <- 0.25
    
    polygon.LAD <- data.frame(x = c(min.score, min.score, limits[2], limits[2]),
                              y = c(limits[1], 0 + min.score - min.difference, limits[2] - min.difference, limits[1]))
    polygon.NAD <- data.frame(x = c(limits[1], 0 + min.score - min.difference, limits[2] - min.difference, limits[1]),
                              y = c(min.score, min.score, limits[2], limits[2]))
  }
  
  
  plt <- ggplot(x, aes(x = cell_LMNB1, y = cell_4xAP3, col = cell)) + 
    geom_abline(lty = 2)
  
  if (polygon) {
    plt <- plt + 
      geom_polygon(data = polygon.LAD, aes(x = x, y = y), col = NA, fill = "blue", alpha = 0.1) +
      geom_polygon(data = polygon.NAD, aes(x = x, y = y), col = NA, fill = "green", alpha = 0.1)
  }
  
  plt <- plt +
    geom_point(alpha = alpha, size = size) +
    ggtitle(cell) + 
    xlab("LMNB1") +
    ylab("4xAP3") +
    xlim(limits) + ylim(limits) +
    # scale_color_gradient2(low = "blue", mid = "grey", high = "red", midpoint = 0) +
    scale_colour_gradientn(colors = colorRamps::matlab.like(9)[c(1:4, 6:9)]) +
    theme_bw() +
    theme(aspect.ratio = 1)
  
  plt
  
}


cell <- "H1"
plotExpression(genes, regions, cell)

plotExpression(genes, regions, cell, gene_filter = "protein_coding")

plotRegionsWithExpression(genes, regions, cell, gene_filter = "protein_coding")
## Warning: Removed 12323 rows containing missing values (geom_point).

plotDamIDWithExpression(genes, damid.combined, cell, gene_filter = "protein_coding")
## Warning: Removed 177 rows containing missing values (geom_point).

cell <- "K562"
plotExpression(genes, regions, cell)

plotExpression(genes, regions, cell, gene_filter = "protein_coding")

plotRegionsWithExpression(genes, regions, cell, gene_filter = "protein_coding")
## Warning: Removed 15150 rows containing missing values (geom_point).

plotDamIDWithExpression(genes, damid.combined, cell, gene_filter = "protein_coding")
## Warning: Removed 166 rows containing missing values (geom_point).

cell <- "Hap1"
plotExpression(genes, regions, cell)

plotExpression(genes, regions, cell, gene_filter = "protein_coding")

plotRegionsWithExpression(genes, regions, cell, gene_filter = "protein_coding")
## Warning: Removed 15169 rows containing missing values (geom_point).

plotDamIDWithExpression(genes, damid.combined, cell, gene_filter = "protein_coding")
## Warning: Removed 177 rows containing missing values (geom_point).

cell <- "HCT116"
plotExpression(genes, regions, cell)

plotExpression(genes, regions, cell, gene_filter = "protein_coding")

plotRegionsWithExpression(genes, regions, cell, gene_filter = "protein_coding")
## Warning: Removed 15610 rows containing missing values (geom_point).

plotDamIDWithExpression(genes, damid.combined, cell, gene_filter = "protein_coding")
## Warning: Removed 134 rows containing missing values (geom_point).

cell <- "RPE"
plotExpression(genes, regions, cell)

plotExpression(genes, regions, cell, gene_filter = "protein_coding")

plotRegionsWithExpression(genes, regions, cell, gene_filter = "protein_coding")
## Warning: Removed 13730 rows containing missing values (geom_point).

plotDamIDWithExpression(genes, damid.combined, cell, gene_filter = "protein_coding")
## Warning: Removed 156 rows containing missing values (geom_point).

cell <- "HFF"
plotExpression(genes, regions, cell)

plotExpression(genes, regions, cell, gene_filter = "protein_coding")

plotRegionsWithExpression(genes, regions, cell, gene_filter = "protein_coding")
## Warning: Removed 14902 rows containing missing values (geom_point).

plotDamIDWithExpression(genes, damid.combined, cell, gene_filter = "protein_coding")
## Warning: Removed 127 rows containing missing values (geom_point).

Well, that’s something. Based on the RNA-seq data, it almost seems as if the nucleolus is the more repressive compartment. Or, to put it differently, preferential localization as I determine it is a better predictor of gene repression for the nucleolus than for the lamina. Overall, you do see the expected drop from “NA” (= iLADs) into the regions. Interestingly, I keep seeing that nucleolus is lower. Even though these regions are almost completely mirrors between H1/K562 and HFF.

Note that this is only with protein-coding genes! When including more things, as lincRNAs, I realized that the numbers of 0s became (too) high.

Scatterplots

Until now, I’ve only been showing the data based on the binary division. What happens when I make a scatterplot of the difference vs expression levels?

Using actual DamID scores per gene

A big problem I’ve realized with this work is that the regions are very large and imprecise. In other words, many of the genes still expressed are often near borders or other regions with very low DamID signal. Indicating that these genes are not so much near the compartment in the first place.

A better solution is to use the actual DamID data. When I calculate the mean interaction score for each gene instead of using the region score, it should give insight to which extend this gene is “within” the compartment.

Candidate genes

So far, I’ve looked at gene expression within the differential regions. To understand the process of this differential localization and why HFF seems to be more heavily nucleolus and the other two more lamina, I can look at gene expression as well. Several genes have already been implicated to cause a similar effect during senescence. Can I also find the difference in expression here?

#cells <- c("H1", "K562", "GM12878", "Hap1", "KBM7", "HEPG2", "HCT116", "RPE", "IMR.90", "HFF")
cells <- c("H1", "K562", "Hap1", "HCT116", "RPE", "HFF")

candidate_genes <- c("LMNB1", "LMNB2", "LMNA", "LBR", "EHMT2", "EZH2", "EMD", "LEMD1", "LEMD2", "LEMD3", "TMPO", "ANKLE1", "ANKLE2",
                     "HDAC3", "GCFC2", "SUN1", "SUN2", "BANF1", "BCLAF1", "MKL1", "CBX5", "CBX1", "CBX3")
# candidate_genes <- c("SMC1A", "SMC2", "SMC3", "SMC4", "WAPL", "MED12", "CTCF")

candidate_genes.df <- data.frame(gene_name = candidate_genes,
                                 stringsAsFactors = FALSE)

# Determinet the correct overlap
idx <- match(candidate_genes.df$gene_name,
             mcols(genes)$gene_name)

candidate_genes.df$gene_id <- mcols(genes)$gene_id[idx]

# Add the data
candidate_genes.df <- cbind(candidate_genes.df, rnaseq_dds.assay[idx, ])

# Melt the data
candidate_genes.df.melt <- melt(candidate_genes.df, 
                                id.vars = c("gene_id", "gene_name"))

idx <- match(candidate_genes.df.melt$variable, samples_star.df$samples)

candidate_genes.df.melt$gene_name <- factor(candidate_genes.df.melt$gene_name,
                                            levels = candidate_genes)

candidate_genes.df.melt$stranded <- as.character(samples_star.df$stranded[idx])

candidate_genes.df.melt$rRNADepleted <- as.character(samples_star.df$rRNADepleted[idx])


candidate_genes.df.melt$cell <- as.character(samples_star.df$cell[idx])
candidate_genes.df.melt <- candidate_genes.df.melt[candidate_genes.df.melt$cell %in% cells, ]
candidate_genes.df.melt$cell <- factor(candidate_genes.df.melt$cell, 
                                       levels = cells)

# Plot with ggplot
ggplot(candidate_genes.df.melt, aes(x = cell, y = value, color = cell)) +
  geom_quasirandom(aes(shape = stranded)) +
  facet_wrap(~ gene_name, scales = "free_y") +
  geom_boxplot(fill = NA, col = "black", size = 0.5, outlier.colour = NA) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

This is just a quick selection of candidate genes. However, you can observe the expected: for the more differentiated or senescent cells, you have higher expression of LMNA and lower expression of LMNB1, together with expression differences in for example LBR and EHMT2. Now, to which extend can we explain the observed differences in localization with these expression differences?

What I think I need to do:

  • Measure expression levels of all cell types. HFF seems near the end-point of differentiation with many LADs, H3K9me3 regions and a strong nucleolar signal. As you can see, this goes together with a low LMNB1 score and a high LMNA score. Could it be that HCT116 and RPE are somehow intermediates with different balances of LMNB1 / LMNA and therefore explain the observed results?
  • Reduce expression levels of certain candidates in Hap1 (LMNB1, EHMT2) and increase expression of others (LMNA), and see what happens.
  • Possibly the same thing in HFF.

In this way, by disturbing things one gene at a time, I hope to understand the differences in observed behaviour.

Gene expression for each chromosome

The DamID data has shown that specific chromosomes have different lamina / nucleolus balances, that vary per cell type. Overall, active chromosomes have been linked to the nuclear interior with inactive chromosomes near the lamina. Does this hold true here?

# First, for all genes
cells <- c("H1", "K562", "Hap1", "HCT116", "RPE", "HFF")
genes.df <- as(genes, "data.frame")
genes.df <- genes.df[genes.df$gene_type == "protein_coding", ]

genes.df <- genes.df[, c("seqnames", cells)]
genes.df <- genes.df[genes.df$seqnames %in% c(paste0("chr", 1:22), "chrX"), ]
genes.df <- melt(genes.df, id.vars = "seqnames")

ggplot(genes.df, aes(x = seqnames, y = value, col = variable)) +
  #geom_violin() +
  #geom_boxplot(outlier.shape = NA, width = 0.1, fill = NA) +
  geom_quasirandom(size = 0.5) +
  facet_grid(variable ~ .) +
  xlab("") +
  ylab("rlog(expression)") +
  ggtitle("All genes") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Second, for LAD-genes
cLADs <- intersect(regions[["H1"]], regions[["K562"]])
cLADs <- intersect(cLADs, regions[["Hap1"]])
cLADs <- intersect(cLADs, regions[["HCT116"]])
cLADs <- intersect(cLADs, regions[["RPE"]])
cLADs <- intersect(cLADs, regions[["HFF"]])

genes.df.cLADs <- as(genes[overlapsAny(genes, cLADs)],
                   "data.frame")

genes.df.cLADs <- genes.df.cLADs[genes.df.cLADs$gene_type == "protein_coding", ]

genes.df.cLADs <- genes.df.cLADs[, c("seqnames", cells)]
genes.df.cLADs <- genes.df.cLADs[genes.df.cLADs$seqnames %in% c(paste0("chr", 1:22), "chrX"), ]
genes.df.cLADs <- melt(genes.df.cLADs, id.vars = "seqnames")

ggplot(genes.df.cLADs, aes(x = seqnames, y = value, col = variable)) +
  #geom_violin() +
  #geom_boxplot(outlier.shape = NA, width = 0.1, fill = NA) +
  geom_quasirandom(size = 0.5) +
  facet_grid(variable ~ .) +
  xlab("") +
  ylab("rlog(expression)") +
  ggtitle("cLAD genes") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Expression bed files; and save expression matrix

For visualization, I would like to have a bed file with expression score that can be visualized easily in IGV.

# Prepare output
bed_out <- file.path(output_dir, "expression_bed")
dir.create(bed_out, showWarnings = FALSE)

# Set seqinfo from genes object for later .bw files
seqinfo <- read.table("~/mydata/data/genomes/GRCh38/hg38.chrom.sizes", sep = "\t")
idx <- match(seqlevels(genes), seqinfo[, 1])
seqlengths(genes) <- seqinfo[match(seqlevels(genes), seqinfo[, 1]), 2]

# Only use the start of the gene for visualization and remove dupicates
genes_start <- genes
start(genes_start) <- end(genes_start) <- ifelse(strand(genes) == "+", start(genes), end(genes))
genes_start <- genes_start[which(! duplicated(data.frame(seqnames(genes_start), start(genes_start))))]

for (cell in cells) {
  
  genes_bed <- genes_start
  
  # Filter for protein-coding for now
  genes_bed <- genes_bed[mcols(genes_bed)$gene_type == "protein_coding"]
  
  mcols(genes_bed) <- mcols(genes_bed)[, c(cell, "gene_name")]
  names(mcols(genes_bed)) <- c("score", "gene_name")
  
  # Export as .bed
  export.bed(genes_bed, file.path(bed_out,
                                  paste0(cell, "_rlog.bed")))
  export.bw(genes_bed, file.path(bed_out,
                                 paste0(cell, "_rlog.bw")))
  
}

# Save expression matrix
saveRDS(genes, 
        file.path(output_dir,
                  "genes_expression.rds"))

SessionInfo

sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.5 LTS
## 
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] RColorBrewer_1.1-2         DESeq2_1.18.1             
##  [3] SummarizedExperiment_1.8.1 DelayedArray_0.4.1        
##  [5] matrixStats_0.53.1         Biobase_2.38.0            
##  [7] ggbeeswarm_0.6.0           reshape2_1.4.3            
##  [9] ggplot2_2.2.1              rtracklayer_1.38.3        
## [11] GenomicRanges_1.30.1       GenomeInfoDb_1.14.0       
## [13] IRanges_2.12.0             S4Vectors_0.16.0          
## [15] BiocGenerics_0.24.0       
## 
## loaded via a namespace (and not attached):
##  [1] bit64_0.9-7              splines_3.4.4           
##  [3] Formula_1.2-2            latticeExtra_0.6-28     
##  [5] blob_1.1.0               GenomeInfoDbData_1.0.0  
##  [7] vipor_0.4.5              Rsamtools_1.30.0        
##  [9] yaml_2.1.19              pillar_1.1.0            
## [11] RSQLite_2.0              backports_1.1.2         
## [13] lattice_0.20-35          digest_0.6.15           
## [15] XVector_0.18.0           checkmate_1.8.5         
## [17] colorspace_1.3-2         htmltools_0.3.6         
## [19] Matrix_1.2-14            plyr_1.8.4              
## [21] XML_3.98-1.11            genefilter_1.60.0       
## [23] zlibbioc_1.24.0          xtable_1.8-2            
## [25] scales_0.5.0             BiocParallel_1.12.0     
## [27] htmlTable_1.11.2         tibble_1.4.1            
## [29] annotate_1.56.2          nnet_7.3-12             
## [31] lazyeval_0.2.1           survival_2.42-6         
## [33] magrittr_1.5             memoise_1.1.0           
## [35] evaluate_0.10.1          foreign_0.8-70          
## [37] beeswarm_0.2.3           tools_3.4.4             
## [39] data.table_1.10.4-3      stringr_1.3.0           
## [41] locfit_1.5-9.1           munsell_0.4.3           
## [43] cluster_2.0.7-1          AnnotationDbi_1.40.0    
## [45] Biostrings_2.46.0        colorRamps_2.3          
## [47] compiler_3.4.4           rlang_0.1.6             
## [49] grid_3.4.4               RCurl_1.95-4.10         
## [51] rstudioapi_0.7           htmlwidgets_1.0         
## [53] labeling_0.3             bitops_1.0-6            
## [55] base64enc_0.1-3          rmarkdown_1.8           
## [57] gtable_0.2.0             DBI_0.7                 
## [59] GenomicAlignments_1.14.1 gridExtra_2.3           
## [61] knitr_1.18               bit_1.1-12              
## [63] Hmisc_4.1-1              rprojroot_1.3-2         
## [65] stringi_1.1.6            Rcpp_0.12.14            
## [67] geneplotter_1.56.0       rpart_4.1-13            
## [69] acepack_1.4.1